{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Engineer features and convert time series data to images"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports & Settings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To install `talib` with Python 3.7 follow [these](https://medium.com/@joelzhang/install-ta-lib-in-python-3-7-51219acacafb) instructions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:15.760121Z",
     "start_time": "2020-06-21T20:37:15.758040Z"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:16.741666Z",
     "start_time": "2020-06-21T20:37:15.761936Z"
    }
   },
   "outputs": [],
   "source": [
    "from talib import (RSI, BBANDS, MACD,\n",
    "                   NATR, WILLR, WMA,\n",
    "                   EMA, SMA, CCI, CMO,\n",
    "                   MACD, PPO, ROC,\n",
    "                   ADOSC, ADX, MOM)\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from statsmodels.regression.rolling import RollingOLS\n",
    "import statsmodels.api as sm\n",
    "import pandas_datareader.data as web\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:16.744601Z",
     "start_time": "2020-06-21T20:37:16.742782Z"
    }
   },
   "outputs": [],
   "source": [
    "DATA_STORE = '../data/assets.h5'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:16.753382Z",
     "start_time": "2020-06-21T20:37:16.746915Z"
    }
   },
   "outputs": [],
   "source": [
    "MONTH = 21\n",
    "YEAR = 12 * MONTH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:16.761731Z",
     "start_time": "2020-06-21T20:37:16.754935Z"
    }
   },
   "outputs": [],
   "source": [
    "START = '2000-01-01'\n",
    "END = '2017-12-31'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:16.770374Z",
     "start_time": "2020-06-21T20:37:16.763881Z"
    }
   },
   "outputs": [],
   "source": [
    "sns.set_style('whitegrid')\n",
    "idx = pd.IndexSlice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:16.778854Z",
     "start_time": "2020-06-21T20:37:16.772408Z"
    }
   },
   "outputs": [],
   "source": [
    "T = [1, 5, 10, 21, 42, 63]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results_path = Path('results', 'cnn_for_trading')\n",
    "if not results_path.exists():\n",
    "    results_path.mkdir(parents=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading Quandl Wiki Stock Prices & Meta Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:16.787005Z",
     "start_time": "2020-06-21T20:37:16.780022Z"
    }
   },
   "outputs": [],
   "source": [
    "adj_ohlcv = ['adj_open', 'adj_close', 'adj_low', 'adj_high', 'adj_volume']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:24.629115Z",
     "start_time": "2020-06-21T20:37:16.788061Z"
    }
   },
   "outputs": [],
   "source": [
    "with pd.HDFStore(DATA_STORE) as store:\n",
    "    prices = (store['quandl/wiki/prices']\n",
    "              .loc[idx[START:END, :], adj_ohlcv]\n",
    "              .rename(columns=lambda x: x.replace('adj_', ''))\n",
    "              .swaplevel()\n",
    "              .sort_index()\n",
    "             .dropna())\n",
    "    metadata = (store['us_equities/stocks'].loc[:, ['marketcap', 'sector']])\n",
    "ohlcv = prices.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:24.686712Z",
     "start_time": "2020-06-21T20:37:24.629969Z"
    }
   },
   "outputs": [],
   "source": [
    "prices.volume /= 1e3\n",
    "prices.index.names = ['symbol', 'date']\n",
    "metadata.index.name = 'symbol'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rolling universe: pick 500 most-traded stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:28.053157Z",
     "start_time": "2020-06-21T20:37:24.687633Z"
    }
   },
   "outputs": [],
   "source": [
    "dollar_vol = prices.close.mul(prices.volume).unstack('symbol').sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:28.144127Z",
     "start_time": "2020-06-21T20:37:28.054189Z"
    }
   },
   "outputs": [],
   "source": [
    "years = sorted(np.unique([d.year for d in prices.index.get_level_values('date').unique()]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:37:28.146595Z",
     "start_time": "2020-06-21T20:37:28.144928Z"
    }
   },
   "outputs": [],
   "source": [
    "train_window = 5 # years\n",
    "universe_size = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:40:53.551501Z",
     "start_time": "2020-06-21T20:37:28.148119Z"
    }
   },
   "outputs": [],
   "source": [
    "universe = []\n",
    "for i, year in enumerate(years[5:], 5):\n",
    "    start = str(years[i-5])\n",
    "    end = str(years[i])\n",
    "    most_traded = dollar_vol.loc[start:end, :].dropna(thresh=1000, axis=1).median().nlargest(universe_size).index\n",
    "    universe.append(prices.loc[idx[most_traded, start:end], :])\n",
    "universe = pd.concat(universe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:40:53.908788Z",
     "start_time": "2020-06-21T20:40:53.552659Z"
    }
   },
   "outputs": [],
   "source": [
    "universe = universe.loc[~universe.index.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:40:53.969761Z",
     "start_time": "2020-06-21T20:40:53.910704Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2530228 entries, ('A', Timestamp('2000-01-03 00:00:00')) to ('ZTS', Timestamp('2017-12-29 00:00:00'))\n",
      "Data columns (total 5 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   open    2530228 non-null  float64\n",
      " 1   close   2530228 non-null  float64\n",
      " 2   low     2530228 non-null  float64\n",
      " 3   high    2530228 non-null  float64\n",
      " 4   volume  2530228 non-null  float64\n",
      "dtypes: float64(5)\n",
      "memory usage: 106.2+ MB\n"
     ]
    }
   ],
   "source": [
    "universe.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:40:54.023517Z",
     "start_time": "2020-06-21T20:40:53.970896Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     735.000000\n",
       "mean     3442.487075\n",
       "std      1145.365643\n",
       "min      1043.000000\n",
       "25%      2368.000000\n",
       "50%      3792.000000\n",
       "75%      4527.000000\n",
       "max      4528.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "universe.groupby('symbol').size().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:40:54.120534Z",
     "start_time": "2020-06-21T20:40:54.024686Z"
    }
   },
   "outputs": [],
   "source": [
    "universe.to_hdf('data.h5', 'universe')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate Technical Indicators Factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:40:54.123706Z",
     "start_time": "2020-06-21T20:40:54.121876Z"
    }
   },
   "outputs": [],
   "source": [
    "T = list(range(6, 21))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Relative Strength Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:41:47.025956Z",
     "start_time": "2020-06-21T20:40:54.124895Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_RSI'] = universe.groupby(level='symbol').close.apply(RSI, timeperiod=t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Williams %R"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:42:40.021599Z",
     "start_time": "2020-06-21T20:41:47.026892Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_WILLR'] = (universe.groupby(level='symbol', group_keys=False)\n",
    "     .apply(lambda x: WILLR(x.high, x.low, x.close, timeperiod=t)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute Bollinger Bands"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:42:40.028088Z",
     "start_time": "2020-06-21T20:42:40.024104Z"
    }
   },
   "outputs": [],
   "source": [
    "def compute_bb(close, timeperiod):\n",
    "    high, mid, low = BBANDS(close, timeperiod=timeperiod)\n",
    "    return pd.DataFrame({f'{timeperiod:02}_BBH': high, f'{timeperiod:02}_BBL': low}, index=close.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:43:35.628369Z",
     "start_time": "2020-06-21T20:42:40.029510Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    bbh, bbl = f'{t:02}_BBH', f'{t:02}_BBL'\n",
    "    universe = (universe.join(\n",
    "        universe.groupby(level='symbol').close.apply(compute_bb,\n",
    "                                                     timeperiod=t)))\n",
    "    universe[bbh] = universe[bbh].sub(universe.close).div(universe[bbh]).apply(np.log1p)\n",
    "    universe[bbl] = universe.close.sub(universe[bbl]).div(universe.close).apply(np.log1p)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Normalized Average True Range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:44:37.932288Z",
     "start_time": "2020-06-21T20:43:35.629528Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_NATR'] = universe.groupby(level='symbol', \n",
    "                                group_keys=False).apply(lambda x: \n",
    "                                                        NATR(x.high, x.low, x.close, timeperiod=t))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Percentage Price Oscillator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:45:32.493604Z",
     "start_time": "2020-06-21T20:44:37.933393Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_PPO'] = universe.groupby(level='symbol').close.apply(PPO, fastperiod=t, matype=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Moving Average Convergence/Divergence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:45:32.497583Z",
     "start_time": "2020-06-21T20:45:32.494631Z"
    }
   },
   "outputs": [],
   "source": [
    "def compute_macd(close, signalperiod):\n",
    "    macd = MACD(close, signalperiod=signalperiod)[0]\n",
    "    return (macd - np.mean(macd))/np.std(macd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:46:29.865618Z",
     "start_time": "2020-06-21T20:45:32.498708Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_MACD'] = (universe\n",
    "                  .groupby('symbol', group_keys=False)\n",
    "                  .close\n",
    "                  .apply(compute_macd, signalperiod=t))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Momentum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:47:23.848242Z",
     "start_time": "2020-06-21T20:46:29.866751Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_MOM'] = universe.groupby(level='symbol').close.apply(MOM, timeperiod=t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Weighted Moving Average"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:48:21.394249Z",
     "start_time": "2020-06-21T20:47:23.849279Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_WMA'] = universe.groupby(level='symbol').close.apply(WMA, timeperiod=t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exponential Moving Average"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:49:17.626017Z",
     "start_time": "2020-06-21T20:48:21.395114Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_EMA'] = universe.groupby(level='symbol').close.apply(EMA, timeperiod=t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Commodity Channel Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:50:42.056961Z",
     "start_time": "2020-06-21T20:49:17.627010Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:    \n",
    "    universe[f'{t:02}_CCI'] = (universe.groupby(level='symbol', group_keys=False)\n",
    "     .apply(lambda x: CCI(x.high, x.low, x.close, timeperiod=t)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Chande Momentum Oscillator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:51:45.162684Z",
     "start_time": "2020-06-21T20:50:42.058009Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_CMO'] = universe.groupby(level='symbol').close.apply(CMO, timeperiod=t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-20T02:28:43.696896Z",
     "start_time": "2020-03-20T02:28:43.694227Z"
    }
   },
   "source": [
    "### Rate of Change"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Rate of change is a technical indicator that illustrates the speed of price change over a period of time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:52:47.158726Z",
     "start_time": "2020-06-21T20:51:45.163633Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_ROC'] = universe.groupby(level='symbol').close.apply(ROC, timeperiod=t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Chaikin A/D Oscillator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:54:23.006176Z",
     "start_time": "2020-06-21T20:52:47.159827Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_ADOSC'] = (universe.groupby(level='symbol', group_keys=False)\n",
    "     .apply(lambda x: ADOSC(x.high, x.low, x.close, x.volume, fastperiod=t-3, slowperiod=4+t)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Average Directional Movement Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:00.850560Z",
     "start_time": "2020-06-21T20:54:23.007088Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in T:\n",
    "    universe[f'{t:02}_ADX'] = universe.groupby(level='symbol', \n",
    "                                group_keys=False).apply(lambda x: \n",
    "                                                        ADX(x.high, x.low, x.close, timeperiod=t))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:07.524982Z",
     "start_time": "2020-06-21T20:56:00.851652Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "universe.drop(ohlcv, axis=1).to_hdf('data.h5', 'features')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute Historical Returns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Historical Returns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:08.226871Z",
     "start_time": "2020-06-21T20:56:07.526384Z"
    }
   },
   "outputs": [],
   "source": [
    "by_sym = universe.groupby(level='symbol').close\n",
    "for t in [1,5]:\n",
    "    universe[f'r{t:02}'] = by_sym.pct_change(t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:10.403800Z",
     "start_time": "2020-06-21T20:56:08.227895Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>r01</th>\n",
       "      <th>r05</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2.529493e+06</td>\n",
       "      <td>2.526553e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>6.710840e-04</td>\n",
       "      <td>3.293540e-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.875355e-02</td>\n",
       "      <td>6.344951e-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-9.718670e-01</td>\n",
       "      <td>-9.795396e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>-1.034141e-02</td>\n",
       "      <td>-2.246575e-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.236246e-04</td>\n",
       "      <td>2.921130e-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1.122661e-02</td>\n",
       "      <td>2.811951e-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.216425e+01</td>\n",
       "      <td>1.252657e+01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                r01           r05\n",
       "count  2.529493e+06  2.526553e+06\n",
       "mean   6.710840e-04  3.293540e-03\n",
       "std    2.875355e-02  6.344951e-02\n",
       "min   -9.718670e-01 -9.795396e-01\n",
       "25%   -1.034141e-02 -2.246575e-02\n",
       "50%    3.236246e-04  2.921130e-03\n",
       "75%    1.122661e-02  2.811951e-02\n",
       "max    1.216425e+01  1.252657e+01"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "universe[[f'r{t:02}' for t in [1, 5]]].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:10.411970Z",
     "start_time": "2020-06-21T20:56:10.405080Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outliers = universe[universe.r01>1].index.get_level_values('symbol').unique()\n",
    "len(outliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:23.267374Z",
     "start_time": "2020-06-21T20:56:10.413503Z"
    }
   },
   "outputs": [],
   "source": [
    "universe = universe.drop(outliers, level='symbol')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Historical return quantiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:36.529808Z",
     "start_time": "2020-06-21T20:56:23.268391Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in [1, 5]:\n",
    "    universe[f'r{t:02}dec'] = (universe[f'r{t:02}'].groupby(level='date')\n",
    "             .apply(lambda x: pd.qcut(x, q=10, labels=False, duplicates='drop')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rolling Factor Betas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:38.057297Z",
     "start_time": "2020-06-21T20:56:36.531100Z"
    }
   },
   "outputs": [],
   "source": [
    "factor_data = (web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench', \n",
    "                              start=START)[0].rename(columns={'Mkt-RF': 'Market'}))\n",
    "factor_data.index.names = ['date']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:38.064467Z",
     "start_time": "2020-06-21T20:56:38.058469Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 5114 entries, 2000-01-03 to 2020-04-30\n",
      "Data columns (total 6 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   Market  5114 non-null   float64\n",
      " 1   SMB     5114 non-null   float64\n",
      " 2   HML     5114 non-null   float64\n",
      " 3   RMW     5114 non-null   float64\n",
      " 4   CMA     5114 non-null   float64\n",
      " 5   RF      5114 non-null   float64\n",
      "dtypes: float64(6)\n",
      "memory usage: 279.7 KB\n"
     ]
    }
   ],
   "source": [
    "factor_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T20:56:38.075270Z",
     "start_time": "2020-06-21T20:56:38.065435Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "windows = list(range(15, 90, 5))\n",
    "len(windows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T21:16:36.122535Z",
     "start_time": "2020-06-21T20:56:38.076692Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "t = 1\n",
    "ret = f'r{t:02}'\n",
    "factors = ['Market', 'SMB', 'HML', 'RMW', 'CMA']\n",
    "windows = list(range(15, 90, 5))\n",
    "for window in windows:\n",
    "    print(window)\n",
    "    betas = []\n",
    "    for symbol, data in universe.groupby(level='symbol'):\n",
    "        model_data = data[[ret]].merge(factor_data, on='date').dropna()\n",
    "        model_data[ret] -= model_data.RF\n",
    "\n",
    "        rolling_ols = RollingOLS(endog=model_data[ret], \n",
    "                                 exog=sm.add_constant(model_data[factors]), window=window)\n",
    "        factor_model = rolling_ols.fit(params_only=True).params.drop('const', axis=1)\n",
    "        result = factor_model.assign(symbol=symbol).set_index('symbol', append=True)\n",
    "        betas.append(result)\n",
    "    betas = pd.concat(betas).rename(columns=lambda x: f'{window:02}_{x}')\n",
    "    universe = universe.join(betas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute Forward Returns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T21:17:06.727614Z",
     "start_time": "2020-06-21T21:16:36.123397Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in [1, 5]:\n",
    "    universe[f'r{t:02}_fwd'] = universe.groupby(level='symbol')[f'r{t:02}'].shift(-t)\n",
    "    universe[f'r{t:02}dec_fwd'] = universe.groupby(level='symbol')[f'r{t:02}dec'].shift(-t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Store Model Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T21:17:14.589697Z",
     "start_time": "2020-06-21T21:17:06.728460Z"
    }
   },
   "outputs": [],
   "source": [
    "universe = universe.drop(ohlcv, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T21:17:14.636312Z",
     "start_time": "2020-06-21T21:17:14.590596Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 2499265 entries, ('A', Timestamp('2000-01-03 00:00:00')) to ('ZTS', Timestamp('2017-12-29 00:00:00'))\n",
      "Columns: 308 entries, 06_RSI to r05dec_fwd\n",
      "dtypes: float64(308)\n",
      "memory usage: 5.7+ GB\n"
     ]
    }
   ],
   "source": [
    "universe.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T21:17:14.640453Z",
     "start_time": "2020-06-21T21:17:14.637667Z"
    }
   },
   "outputs": [],
   "source": [
    "drop_cols = ['r01', 'r01dec', 'r05',  'r05dec']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T21:17:14.668654Z",
     "start_time": "2020-06-21T21:17:14.641741Z"
    }
   },
   "outputs": [],
   "source": [
    "outcomes = universe.filter(like='_fwd').columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T21:18:51.378227Z",
     "start_time": "2020-06-21T21:17:14.669468Z"
    }
   },
   "outputs": [],
   "source": [
    "universe = universe.sort_index()\n",
    "with pd.HDFStore('data.h5') as store:\n",
    "    store.put('features', universe.drop(drop_cols, axis=1).drop(outcomes, axis=1).loc[idx[:, '2001':], :])\n",
    "    store.put('targets', universe.loc[idx[:, '2001':], outcomes])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "341.391px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
